In [1]:
import seaborn as sns
import pandas as pd
import plotly.express as px
import numpy as np
In [2]:
data = sns.load_dataset("tips")
data.head(5)
Out[2]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [3]:
# Goal: To train an AI to predict the tip from the total bill
In [4]:
features = data[['total_bill']]
tip = data['tip']

from sklearn import linear_model
f = linear_model.LinearRegression(fit_intercept = False)  # create linear regression model
f.fit(features,tip)
Out[4]:
LinearRegression(fit_intercept=False)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression(fit_intercept=False)
In [5]:
f.predict([[100]])
C:\Users\liliz\AnacondaInstall\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
Out[5]:
array([14.37318953])
In [6]:
f.predict([[70]])
C:\Users\liliz\AnacondaInstall\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
Out[6]:
array([10.06123267])
In [7]:
# add predictions to dataframe for later plotting
data['prediction'] = f.predict(features)  # prediction is the predicted tip.
In [8]:
data
Out[8]:
total_bill tip sex smoker day time size prediction
0 16.99 1.01 Female No Sun Dinner 2 2.442005
1 10.34 1.66 Male No Sun Dinner 3 1.486188
2 21.01 3.50 Male No Sun Dinner 3 3.019807
3 23.68 3.31 Male No Sun Dinner 2 3.403571
4 24.59 3.61 Female No Sun Dinner 4 3.534367
... ... ... ... ... ... ... ... ...
239 29.03 5.92 Male No Sat Dinner 3 4.172537
240 27.18 2.00 Female Yes Sat Dinner 2 3.906633
241 22.67 2.00 Male Yes Sat Dinner 2 3.258402
242 17.82 1.75 Male No Sat Dinner 2 2.561302
243 18.78 3.00 Female No Thur Dinner 2 2.699285

244 rows × 8 columns

In [9]:
# summarize how good the model is
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(x=data['total_bill'], y = data['tip'],
                         mode = 'markers', name = 'actual'))
fig.add_trace(go.Scatter(x=data['total_bill'], y = data['prediction'],
                         mode = 'lines', name = 'predicted'))
fig.update_layout(font_size = 20)
# note: overlaying two plots on one pair of axes requires us to use this sort of code
In [10]:
# get coefficients and intercept from our sklearn model
f.coef_
Out[10]:
array([0.1437319])
In [11]:
f.intercept_
# we told the model intercept to be zero.
Out[11]:
0.0
In [12]:
# fit with intercept
f_w_int = linear_model.LinearRegression(fit_intercept = True)
f_w_int.fit(features,tip)
Out[12]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [13]:
f_w_int.predict([[100]])
# different from f.predict([[100]])
C:\Users\liliz\AnacondaInstall\Lib\site-packages\sklearn\base.py:493: UserWarning:

X does not have valid feature names, but LinearRegression was fitted with feature names

Out[13]:
array([11.42272135])
In [14]:
f_w_int.predict([[0]])
C:\Users\liliz\AnacondaInstall\Lib\site-packages\sklearn\base.py:493: UserWarning:

X does not have valid feature names, but LinearRegression was fitted with feature names

Out[14]:
array([0.92026961])
In [15]:
data['prediction_with_intercept'] = f_w_int.predict(data[['total_bill']])
fig = go.Figure()
fig.add_trace(go.Scatter(x=data['total_bill'], y = data['tip'],
                         mode = 'markers', name = 'actual'))
fig.add_trace(go.Scatter(x=data['total_bill'], y = data['prediction'],
                         mode = 'lines', name = 'predicted(b=0)'))
fig.add_trace(go.Scatter(x=data['total_bill'], y = data['prediction_with_intercept'],
                         mode = 'lines', name = 'predicted(b != 0)'))
fig.update_layout(font_size = 20)
In [16]:
f_w_int.coef_
Out[16]:
array([0.10502452])
In [17]:
f_w_int.intercept_
Out[17]:
0.9202696135546731
In [18]:
# Our new model is predicting the tip is 0.1050*total_bill+0.92. In other works that humans tip 
# 92 cents, then add a 10.5 percent tip on top.
Or in x/y notation, our two models are:
1. y = 0.1437x
2. y = 0.92 + 0.105x
  Cell In[18], line 4
    1. y = 0.1437x
                ^
SyntaxError: invalid decimal literal
In [19]:
# Loss Functions
In [20]:
# Computing the L2 Loss and MSE
In [21]:
data = sns.load_dataset('tips')
data['prediction'] = f.predict(data[['total_bill']])
data.head(5)
Out[21]:
total_bill tip sex smoker day time size prediction
0 16.99 1.01 Female No Sun Dinner 2 2.442005
1 10.34 1.66 Male No Sun Dinner 3 1.486188
2 21.01 3.50 Male No Sun Dinner 3 3.019807
3 23.68 3.31 Male No Sun Dinner 2 3.403571
4 24.59 3.61 Female No Sun Dinner 4 3.534367
In [22]:
# add loss to the dataframe
data['l2_loss'] = (data['tip'] - data['prediction']) ** 2
data.head(5)
Out[22]:
total_bill tip sex smoker day time size prediction l2_loss
0 16.99 1.01 Female No Sun Dinner 2 2.442005 2.050638
1 10.34 1.66 Male No Sun Dinner 3 1.486188 0.030211
2 21.01 3.50 Male No Sun Dinner 3 3.019807 0.230585
3 23.68 3.31 Male No Sun Dinner 2 3.403571 0.008756
4 24.59 3.61 Female No Sun Dinner 4 3.534367 0.005720
In [23]:
# compute the mean
np.mean(data['l2_loss'])
Out[23]:
1.1781161154513171
In [24]:
data['l2_loss'].mean()
Out[24]:
1.1781161154513171
In [25]:
# To calculate the mean squared error in practice, use the mean squared error function provided by sklean.metrics
In [26]:
## or we can compute .MSE directly from the outcome and predictions using mean_squared_error
from sklearn.metrics import mean_squared_error
mean_squared_error(data['tip'],f.predict(data[['total_bill']]))
Out[26]:
1.1781161154513171
In [27]:
mean_squared_error(data['tip'],data['prediction'])
Out[27]:
1.1781161154513171
In [28]:
# Understanding that MSE is a Function of Oe Variable (Theta)
In [29]:
data
Out[29]:
total_bill tip sex smoker day time size prediction l2_loss
0 16.99 1.01 Female No Sun Dinner 2 2.442005 2.050638
1 10.34 1.66 Male No Sun Dinner 3 1.486188 0.030211
2 21.01 3.50 Male No Sun Dinner 3 3.019807 0.230585
3 23.68 3.31 Male No Sun Dinner 2 3.403571 0.008756
4 24.59 3.61 Female No Sun Dinner 4 3.534367 0.005720
... ... ... ... ... ... ... ... ... ...
239 29.03 5.92 Male No Sat Dinner 3 4.172537 3.053627
240 27.18 2.00 Female Yes Sat Dinner 2 3.906633 3.635249
241 22.67 2.00 Male Yes Sat Dinner 2 3.258402 1.583576
242 17.82 1.75 Male No Sat Dinner 2 2.561302 0.658212
243 18.78 3.00 Female No Thur Dinner 2 2.699285 0.090430

244 rows × 9 columns

In [30]:
# replace the prediction column and L2 loss columns with the corresponding values for 20% tip
data['prediction'] = data['total_bill'] * 0.2
data['l2_loss'] = (data['prediction'] - data['tip'])**2
data
Out[30]:
total_bill tip sex smoker day time size prediction l2_loss
0 16.99 1.01 Female No Sun Dinner 2 3.398 5.702544
1 10.34 1.66 Male No Sun Dinner 3 2.068 0.166464
2 21.01 3.50 Male No Sun Dinner 3 4.202 0.492804
3 23.68 3.31 Male No Sun Dinner 2 4.736 2.033476
4 24.59 3.61 Female No Sun Dinner 4 4.918 1.710864
... ... ... ... ... ... ... ... ... ...
239 29.03 5.92 Male No Sat Dinner 3 5.806 0.012996
240 27.18 2.00 Female Yes Sat Dinner 2 5.436 11.806096
241 22.67 2.00 Male Yes Sat Dinner 2 4.534 6.421156
242 17.82 1.75 Male No Sat Dinner 2 3.564 3.290596
243 18.78 3.00 Female No Thur Dinner 2 3.756 0.571536

244 rows × 9 columns

In [31]:
# compute the mean_squared_error for 20% tip
mean_squared_error(data['prediction'], data['tip'])
Out[31]:
2.667486278688525
In [32]:
# computer error for a 20% tip on one line (without creating prediction and l2 loss column)
mean_squared_error(data['total_bill'] * 0.2, data['tip'])
Out[32]:
2.667486278688525
In [33]:
mean_squared_error(data['total_bill'] * 0.08, data['tip'])
Out[33]:
3.08881276852459
In [34]:
mean_squared_error(data['total_bill'] * 0.3, data['tip'])
Out[34]:
12.66543732377049
In [35]:
# define a function mse_given_theta as a function which takes theto 
# and compute MSE
def mse_given_theta(theta):
    return mean_squared_error(data['total_bill'] * theta, data['tip'])
In [36]:
mse_given_theta(0.3)
Out[36]:
12.66543732377049
In [37]:
mse_given_theta(0.2)
Out[37]:
2.667486278688525
In [38]:
# create a list of thetas
thetas = np.linspace(0.1,0.2,100)
thetas
Out[38]:
array([0.1       , 0.1010101 , 0.1020202 , 0.1030303 , 0.1040404 ,
       0.10505051, 0.10606061, 0.10707071, 0.10808081, 0.10909091,
       0.11010101, 0.11111111, 0.11212121, 0.11313131, 0.11414141,
       0.11515152, 0.11616162, 0.11717172, 0.11818182, 0.11919192,
       0.12020202, 0.12121212, 0.12222222, 0.12323232, 0.12424242,
       0.12525253, 0.12626263, 0.12727273, 0.12828283, 0.12929293,
       0.13030303, 0.13131313, 0.13232323, 0.13333333, 0.13434343,
       0.13535354, 0.13636364, 0.13737374, 0.13838384, 0.13939394,
       0.14040404, 0.14141414, 0.14242424, 0.14343434, 0.14444444,
       0.14545455, 0.14646465, 0.14747475, 0.14848485, 0.14949495,
       0.15050505, 0.15151515, 0.15252525, 0.15353535, 0.15454545,
       0.15555556, 0.15656566, 0.15757576, 0.15858586, 0.15959596,
       0.16060606, 0.16161616, 0.16262626, 0.16363636, 0.16464646,
       0.16565657, 0.16666667, 0.16767677, 0.16868687, 0.16969697,
       0.17070707, 0.17171717, 0.17272727, 0.17373737, 0.17474747,
       0.17575758, 0.17676768, 0.17777778, 0.17878788, 0.17979798,
       0.18080808, 0.18181818, 0.18282828, 0.18383838, 0.18484848,
       0.18585859, 0.18686869, 0.18787879, 0.18888889, 0.18989899,
       0.19090909, 0.19191919, 0.19292929, 0.19393939, 0.19494949,
       0.1959596 , 0.1969697 , 0.1979798 , 0.1989899 , 0.2       ])
In [39]:
# compute MSEs for those thetas
mses = [mse_given_theta(theta) for theta in thetas]
mses
Out[39]:
[2.0777683729508194,
 2.0366887534058913,
 1.996569059699077,
 1.9574092918303747,
 1.919209449799786,
 1.8819695336073097,
 1.8456895432529465,
 1.8103694787366964,
 1.7760093400585586,
 1.7426091272185338,
 1.7101688402166224,
 1.678688479052823,
 1.6481680437271375,
 1.6186075342395636,
 1.5900069505901033,
 1.5623662927787565,
 1.5356855608055218,
 1.5099647546704,
 1.4852038743733909,
 1.461402919914495,
 1.4385618912937121,
 1.4166807885110417,
 1.3957596115664843,
 1.37579836046004,
 1.3567970351917082,
 1.3387556357614898,
 1.3216741621693837,
 1.3055526144153906,
 1.2903909924995107,
 1.2761892964217436,
 1.262947526182089,
 1.2506656817805475,
 1.2393437632171185,
 1.2289817704918033,
 1.2195797036046003,
 1.2111375625555103,
 1.2036553473445333,
 1.197133057971669,
 1.1915706944369175,
 1.1869682567402793,
 1.1833257448817533,
 1.1806431588613406,
 1.1789204986790405,
 1.1781577643348538,
 1.1783549558287796,
 1.1795120731608182,
 1.18162911633097,
 1.1847060853392344,
 1.1887429801856118,
 1.193739800870102,
 1.1996965473927048,
 1.206613219753421,
 1.21448981795225,
 1.2233263419891915,
 1.233122791864246,
 1.2438791675774135,
 1.255595469128694,
 1.2682716965180871,
 1.2819078497455934,
 1.2965039288112117,
 1.312059933714944,
 1.3285758644567889,
 1.3460517210367458,
 1.3644875034548163,
 1.3838832117109996,
 1.4042388458052961,
 1.4255544057377052,
 1.4478298915082273,
 1.471065303116862,
 1.4952606405636095,
 1.5204159038484695,
 1.5465310929714433,
 1.5736062079325297,
 1.6016412487317289,
 1.630636215369041,
 1.6605911078444662,
 1.6915059261580032,
 1.7233806703096541,
 1.7562153402994167,
 1.7900099361272936,
 1.824764457793283,
 1.8604789052973851,
 1.8971532786396006,
 1.9347875778199284,
 1.97338180283837,
 2.0129359536949227,
 2.053450030389589,
 2.0949240329223673,
 2.13735796129326,
 2.1807518155022656,
 2.225105595549384,
 2.270419301434615,
 2.316692933157959,
 2.3639264907194146,
 2.4121199741189843,
 2.461273383356666,
 2.5113867184324614,
 2.56245997934637,
 2.614493166098391,
 2.667486278688525]
In [40]:
# to find the best theta, make a plot
# plot MSE vs theta, showing off how to set xaxis_title and yaxis_title
fig = px.line(x = thetas, y = mses)
fig.update_layout(
    xaxis_title = 'theta',
    yaxis_title = 'MSE',
    font_size = 20
)
In [41]:
# to find the best theta, make a plot
# plot MSE vs theta, showing off how to set xaxis_title and yaxis_title
fig = px.line(x = thetas, y = mses)
fig.update_layout(
    xaxis_title = r'$\theta',
    yaxis_title = 'MSE',
    font_size = 20
)
In [42]:
# to find the best theta, make a plot
# plot MSE vs theta, showing off how to set xaxis_title and yaxis_title
fig = px.line(x = thetas, y = mses)
fig.update_layout(
    xaxis_title = r'$\huge(\theta)$',
    yaxis_title = 'MSE',
    font_size = 20
)

# The Plotly library supports LaTeX syntax
In [43]:
# to find the best theta, make a plot
# plot MSE vs theta, showing off how to set xaxis_title and yaxis_title
fig = px.line(x = thetas, y = mses)
fig.update_layout(
    xaxis_title = 'θ',
    yaxis_title = 'MSE',
    font_size = 20
)
# search from google, copy and paste
In [44]:
# The minimum mean squared error happens right where the scikit-learn model picked Theta.
In [45]:
# Using SciPy Optimize to Optimize L2 Loss
In [46]:
import scipy.optimize
In [47]:
# define g as the cube of x plus square of x minus 3 times x plus 2
def g(x):
    return x**3 + x**2 - 3*x + 2
In [48]:
g(12)
Out[48]:
1838
In [49]:
# use scipy.optimze. minimize and compare with wolfram alpha
scipy.optimize.minimize(g, x0 = 1000)
Out[49]:
  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 0.7316461776530541
        x: [ 7.208e-01]
      nit: 20
      jac: [ 1.788e-07]
 hess_inv: [[ 1.580e-01]]
     nfev: 48
     njev: 24
In [50]:
#visualize g
px.line(x = np.linspace(-3,2,100), y =g(np.linspace(-3,2,100)))
# When the function is plotted, we see that the minimizing value is around 0.72.
In [51]:
scipy.optimize.minimize(mse_given_theta, x0 = 0.2)
Out[51]:
  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 1.1781161154513287
        x: [ 1.437e-01]
      nit: 1
      jac: [ 2.384e-06]
 hess_inv: [[1]]
     nfev: 6
     njev: 3
In [52]:
# There are many minimization libraries that use various types of numerical techniques
In [53]:
# This minimization library can fail.
scipy.optimize.minimize(g, x0 = -3)
# The success flag comes up as False
# There is no true absolute minimum for this function
# So it will be important to pick a loss function that has a nice shape for optimization and a minimum 
Out[53]:
  message: Desired error not necessarily achieved due to precision loss.
  success: False
   status: 2
      fun: -1114853117.349824
        x: [-1.037e+03]
      nit: 1
      jac: [ 3.226e+06]
 hess_inv: [[-3.206e-04]]
     nfev: 236
     njev: 112
In [54]:
# Multiple Linear Regression
In [55]:
data
Out[55]:
total_bill tip sex smoker day time size prediction l2_loss
0 16.99 1.01 Female No Sun Dinner 2 3.398 5.702544
1 10.34 1.66 Male No Sun Dinner 3 2.068 0.166464
2 21.01 3.50 Male No Sun Dinner 3 4.202 0.492804
3 23.68 3.31 Male No Sun Dinner 2 4.736 2.033476
4 24.59 3.61 Female No Sun Dinner 4 4.918 1.710864
... ... ... ... ... ... ... ... ... ...
239 29.03 5.92 Male No Sat Dinner 3 5.806 0.012996
240 27.18 2.00 Female Yes Sat Dinner 2 5.436 11.806096
241 22.67 2.00 Male Yes Sat Dinner 2 4.534 6.421156
242 17.82 1.75 Male No Sat Dinner 2 3.564 3.290596
243 18.78 3.00 Female No Thur Dinner 2 3.756 0.571536

244 rows × 9 columns

In [56]:
# fit a model on total_bill and size
features = data[['total_bill', 'size']]
tip = data['tip']

f2 = linear_model.LinearRegression(fit_intercept = False)
f2.fit(features, tip)
Out[56]:
LinearRegression(fit_intercept=False)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression(fit_intercept=False)
In [57]:
# show coefficients
f2.coef_
# two coefficients, one for 'total_bill', the other for 'size'
Out[57]:
array([0.1007119 , 0.36209717])
In [58]:
# make a prediction for a table with $10 total bill and 3 people seated
f2.predict([[10,3]])
# predictions: f2 model: $2.09
C:\Users\liliz\AnacondaInstall\Lib\site-packages\sklearn\base.py:493: UserWarning:

X does not have valid feature names, but LinearRegression was fitted with feature names

Out[58]:
array([2.09341054])
In [59]:
# previous model f
f.predict([[10]])
# f model: $1.43
C:\Users\liliz\AnacondaInstall\Lib\site-packages\sklearn\base.py:493: UserWarning:

X does not have valid feature names, but LinearRegression was fitted with feature names

Out[59]:
array([1.43731895])
In [60]:
# the f2 model was trained on 2D data, so it can only make predictions on 2D data.
In [61]:
# If there are k parameters in a linear model, you need k features.
In [62]:
# compare preditions for f and f2 side by side in table
data['prediction'] = f.predict(data[['total_bill']])
data['prediction_2d'] = f2.predict(data[['total_bill', 'size']])                                    
In [63]:
data
Out[63]:
total_bill tip sex smoker day time size prediction l2_loss prediction_2d
0 16.99 1.01 Female No Sun Dinner 2 2.442005 5.702544 2.435290
1 10.34 1.66 Male No Sun Dinner 3 1.486188 0.166464 2.127653
2 21.01 3.50 Male No Sun Dinner 3 3.019807 0.492804 3.202249
3 23.68 3.31 Male No Sun Dinner 2 3.403571 2.033476 3.109052
4 24.59 3.61 Female No Sun Dinner 4 3.534367 1.710864 3.924894
... ... ... ... ... ... ... ... ... ... ...
239 29.03 5.92 Male No Sat Dinner 3 4.172537 0.012996 4.009958
240 27.18 2.00 Female Yes Sat Dinner 2 3.906633 11.806096 3.461544
241 22.67 2.00 Male Yes Sat Dinner 2 3.258402 6.421156 3.007333
242 17.82 1.75 Male No Sat Dinner 2 2.561302 3.290596 2.518880
243 18.78 3.00 Female No Thur Dinner 2 2.699285 0.571536 2.615564

244 rows × 10 columns

In [64]:
# so which is better, f or f2? You can use loss functions and compute the mean squared error(MSE)
In [65]:
# compare MSE for f and f2
mean_squared_error(data['prediction'],data['tip'])
Out[65]:
1.1781161154513171
In [66]:
mean_squared_error(data['prediction_2d'], data['tip'])
Out[66]:
1.06482122862577
In [67]:
# predictions: 1D model: 1.178, 2D model: 1.065
# 2D model is better since it gets a lower mean squared error. 
# the model gave higher-quality predictions when it had more information.
In [68]:
# Let explore what are the models acutally doing.
In [69]:
# make 3d plot of our data
px.scatter_3d(data, x = 'total_bill', y = 'size', z = 'tip')
# 3D plot: As the total bill goes up, the tip goes up; as the size goes up, the tip goes up.
In [70]:
# This code is not something we expect you to understand!
# It's just computing predictions for various bills and table sizes
table_bills, table_sizes = np.meshgrid(range(50), range(6))
tip_predictions = (0.1007119 * table_bills + 0.3621 * table_sizes)
In [71]:
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter3d(x = data['total_bill'], y = data['size'], z = data['tip'],
                           mode = 'markers', name = 'actual'))
fig.add_trace(go.Surface(x = table_bills, y = table_sizes, z = tip_predictions, name = 'predicted'))
fig.show()

# 2D plot: The plane increases with both size the the tip; the model is finding the plane of best fit
In [72]:
f.coef_
Out[72]:
array([0.1437319])
In [73]:
f2.coef_
Out[73]:
array([0.1007119 , 0.36209717])
In [74]:
# Our 1D and 2D models as equation
# 1. tip = 0.1437 * bill
# 2. tip = 0.1*bill + 0.36*size
# Even though model 2 has lower MSE, model 1 is probably a better model of reality.
# Model 2 is overfitting.
# Model needs to make sense.
In [75]:
# Using Nonnumberic Features
In [76]:
data = sns.load_dataset('tips')
data.head(5)
Out[76]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [77]:
# create a copy of the dataset that only has 3 features in order to keep things simple
three_features = ['total_bill','size','day']
three_feature_data = pd.DataFrame(data[three_features])
three_feature_data.iloc[[193,90,25,26,190],:]
Out[77]:
total_bill size day
193 15.48 2 Thur
90 28.97 2 Fri
25 17.81 4 Sat
26 13.37 2 Sat
190 15.69 2 Sun
In [78]:
# Let's create 'dummies' that represent whether it is thursday, friday, saturday, or sunday
dummies = pd.get_dummies(three_feature_data['day'])
dummies.iloc[[193,90,25, 26,190], :]
Out[78]:
Thur Fri Sat Sun
193 True False False False
90 False True False False
25 False False True False
26 False False True False
190 False False False True
In [79]:
# concatenate the dummies table with three_feature_data
# pd.concat adds rows or columns to a data frame
data_w_dummies = pd.concat([three_feature_data, dummies], axis = 1)
data_w_dummies.iloc[[193,90,25,26,190],:]
Out[79]:
total_bill size day Thur Fri Sat Sun
193 15.48 2 Thur True False False False
90 28.97 2 Fri False True False False
25 17.81 4 Sat False False True False
26 13.37 2 Sat False False True False
190 15.69 2 Sun False False False True
In [80]:
# The code below will crash since data_w_dummies includes a non-numeric feature
f_with_day = linear_model.LinearRegreassion(fit_intercept=False)
f_with_day.fit(data_w_dummies, tip)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[80], line 2
      1 # The code below will crash since data_w_dummies includes a non-numeric feature
----> 2 f_with_day = linear_model.LinearRegreassion(fit_intercept=False)
      3 f_with_day.fit(data_w_dummies, tip)

AttributeError: module 'sklearn.linear_model' has no attribute 'LinearRegreassion'
In [81]:
# drop the non-numeric column
del data_w_dummies['day']
data_w_dummies.head(5)
Out[81]:
total_bill size Thur Fri Sat Sun
0 16.99 2 False False False True
1 10.34 3 False False False True
2 21.01 3 False False False True
3 23.68 2 False False False True
4 24.59 4 False False False True
In [82]:
# fit the model
f_with_day = linear_model.LinearRegression(fit_intercept=False)
f_with_day.fit(data_w_dummies, tip)
Out[82]:
LinearRegression(fit_intercept=False)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression(fit_intercept=False)
In [83]:
# The model has been trined on numeric and nonnumeric features
In [84]:
f_with_day.coef_
Out[84]:
array([0.09299361, 0.18713231, 0.66829361, 0.74578683, 0.62112858,
       0.73228865])
In [85]:
# Tip prediction
# Size: 3
# Total bill: $50
# Day: Thursday
In [86]:
# Computing the value using our model
# Thursday: $5.88
# Saturday: $5.83
# Sunday: $5.94
In [87]:
# What has the AI learned?
In [88]:
# Evalulating the 6D model:
# It does slightly better than the 2D and 1D models.
# Dule to overfitting, it might perform worse on new observations
In [89]:
px.scatter(data, x='total_bill', y= 'tip', color = 'day', trendline = 'ols')
In [ ]: